This project is an exploratory analysis inspired by an interest in working through a problem similar to a real-world NBA data science workflow. The goal was to engage with the data, explore patterns, and practice applying data science techniques in a realistic setting. Throughout the analysis, I work through the dataset step by step, allowing insights and questions to emerge naturally. The focus is on clear, concise code and effective visualizations, with an emphasis on producing work that mirrors how data science problems are approached in practice.
library(tidyverse)
schedule <- read_csv("schedule.csv")
draft_schedule <- read_csv("schedule_24_partial.csv")
locations <- read_csv("locations.csv")
game_data <- read_csv("team_game_data.csv")
In this section, I analyze NBA scheduling data to understand how game density and rest patterns affect teams, focusing on workload rather than on-court talent. I move step by step from defining a clear fatigue metric, to placing teams in historical context, to interpreting whether observed differences are meaningful or just noise.
I begin by defining a concrete measure of schedule density: 4 games in 6 nights. For the Thunder’s 2024–25 draft schedule, I explicitly identify each game that represents the fourth game played within a rolling six-day window. I allow these windows to overlap, since the goal is to capture how often players are exposed to peak workload conditions rather than to count distinct stretches. This gives me a precise count of how frequently OKC faces high-density situations in the upcoming season.
Next, I broaden the scope to the entire league and look historically from 2014–15 through 2023–24. I calculate how many 4-in-6 situations each team experiences per season and then normalize those counts to a standard 82-game season. This lets me establish a league-wide baseline for what a “typical” workload looks like and determine whether OKC’s draft schedule is heavier or lighter than average.
After establishing the average, I compare teams against one another to identify which franchises consistently experience the most and least schedule density. By averaging across seasons, I find the teams with the highest and lowest long-run exposure to 4-in-6 stretches. This comparison helps determine whether differences are driven by structural scheduling patterns or merely by one-off seasons.
I then assess whether the gap between the most and least affected teams is actually meaningful. To do this, I run a permutation test that simulates how large the difference would be if schedules were effectively random across teams. By comparing the observed gap to this null distribution, I determine that the difference is not particularly surprising and is likely consistent with random variation rather than systematic bias.
Finally, I shift from schedule structure to in-game performance under fatigue conditions. Using Brooklyn as a case study, I calculate their defensive effective field goal percentage during the 2023–24 season and compare it to their defensive performance when opponents are playing on the second night of a back-to-back. This provides a concrete example of how schedule-related fatigue can translate into measurable changes in on-court outcomes.
# Flag each game that is at least the 4th played in the trailing 6-day
# window ending on that game's date (a "4-in-6" exposure).
#
# @param dates Vector of game dates (anything `as.Date()` accepts) for a
#   single team's schedule; order does not matter.
# @return Logical vector, same length as `dates`: TRUE when the window
#   [date - 5, date] contains 4 or more of this team's games.
#
# FIX: the original tested `== 4L`, which silently un-flags any game that
# is the 5th (or later) game within a 6-day window — those games are in an
# even denser stretch and must still count as 4-in-6 exposure.
four_in_six <- function(dates) {
  dates <- as.Date(dates)
  vapply(dates, function(d) sum(dates >= (d - 5) & dates <= d) >= 4L, logical(1))
}
# OKC's 2024-25 draft schedule with each 4-in-6-completing game flagged,
# followed by the total count of such games.
okc_4in6 <- draft_schedule %>%
  filter(team == "OKC") %>%
  mutate(gamedate = as.Date(gamedate)) %>%
  arrange(gamedate) %>%
  mutate(is_4in6 = four_in_six(gamedate))
sum(okc_4in6$is_4in6)
## [1] 26
26 4-in-6 stretches in OKC’s draft schedule.
# Expand a schedule into one row per (season, gamedate, team).
# Handles two layouts: a wide one with home_team/away_team columns (each
# game becomes two rows), and a long one that already has a `team` column.
team_rows <- function(df) {
  is_wide <- all(c("home_team", "away_team") %in% names(df))
  if (!is_wide) {
    return(df %>% transmute(season, gamedate = as.Date(gamedate), team))
  }
  bind_rows(
    df %>% transmute(season, gamedate = as.Date(gamedate), team = home_team),
    df %>% transmute(season, gamedate = as.Date(gamedate), team = away_team)
  )
}
# One row per team-game, restricted to the 2014-15 through 2023-24 seasons.
team_sched <- team_rows(schedule) %>%
  filter(season >= 2014, season <= 2023) %>%
  arrange(team, season, gamedate)

# Per team-season: games played, 4-in-6 count, and the count rescaled to a
# standard 82-game schedule so shortened seasons are comparable.
per_team_season <- team_sched %>%
  group_by(team, season) %>%
  arrange(gamedate, .by_group = TRUE) %>%
  mutate(is_4in6 = four_in_six(gamedate)) %>%
  summarise(
    games = n(),
    four_in_six = sum(is_4in6),
    .groups = "drop"
  ) %>%
  mutate(four_in_six_per82 = four_in_six * (82 / games))

# League-wide mean per-82 rate across every team-season in the sample.
avg_per82_4in6 <- mean(per_team_season$four_in_six_per82, na.rm = TRUE)
avg_per82_4in6
## [1] 25.09998
25.1 4-in-6 stretches on average.
# Long-run average per-82 exposure by franchise, plus the two extremes.
team_avgs <- per_team_season %>%
  group_by(team) %>%
  summarise(avg_4in6_per82 = mean(four_in_six_per82, na.rm = TRUE), .groups = "drop")
most_4in6 <- slice_max(team_avgs, avg_4in6_per82, n = 1)
fewest_4in6 <- slice_min(team_avgs, avg_4in6_per82, n = 1)
most_4in6
## # A tibble: 1 × 2
## team avg_4in6_per82
## <chr> <dbl>
## 1 CHA 28.1
fewest_4in6
## # A tibble: 1 × 2
## team avg_4in6_per82
## <chr> <dbl>
## 1 NYK 22.2
# Horizontal bar chart of long-run 4-in-6 exposure: OKC in orange, the two
# extreme franchises (CHA, NYK) in tan, everyone else muted blue.
highlight_teams <- c("CHA", "NYK")
bold_labels <- c("CHA", "NYK", "OKC")
plot_df <- team_avgs %>%
  mutate(highlight = dplyr::case_when(
    team == "OKC" ~ "OKC",
    team %in% highlight_teams ~ "Highlighted",
    TRUE ~ "Other"
  ))
ggplot(plot_df, aes(x = reorder(team, avg_4in6_per82), y = avg_4in6_per82, fill = highlight)) +
  geom_col() +
  geom_text(aes(label = sprintf("%.1f", avg_4in6_per82)), hjust = -0.1, size = 3) +
  coord_flip() +
  scale_fill_manual(
    values = c("Other" = "steelblue", "Highlighted" = "burlywood", "OKC" = "darkorange"),
    guide = "none"
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.10))) +
  labs(
    title = "Per-82 Average 4-in-6 Stretches by Team (2014–15 to 2023–24)",
    subtitle = "Charlotte Hornets, New York Knicks highlighted; OKC in orange",
    x = "Teams", y = "Average 4-in-6 per 82 games"
  ) +
  theme_minimal(base_size = 10)
set.seed(42)
# Observed statistic: the max-min gap in long-run per-82 averages.
team_avg_diff <- diff(range(team_avgs$avg_4in6_per82, na.rm = TRUE))

# One null draw: shuffle team labels within each season (so every season's
# set of schedules is kept intact) and recompute the max-min gap of the
# resulting per-team long-run averages.
perm <- function(df) {
  shuffled <- df %>%
    group_by(season) %>%
    mutate(team = sample(team)) %>%
    ungroup()
  shuffled %>%
    group_by(team) %>%
    summarise(avg_4in6_per82 = mean(four_in_six_per82, na.rm = TRUE), .groups = "drop") %>%
    summarise(gap = max(avg_4in6_per82) - min(avg_4in6_per82)) %>%
    pull(gap)
}

# Build the null distribution and compare the observed gap against it.
B <- 5000
null_diff <- replicate(B, perm(per_team_season))
p_val <- mean(null_diff >= team_avg_diff)
q95 <- quantile(null_diff, 0.95)
list(observed_gap = team_avg_diff, p_value = p_val, null_95th_percentile = q95)
## $observed_gap
## [1] 5.923077
##
## $p_value
## [1] 0.0664
##
## $null_95th_percentile
## 95%
## 6.044829
# Mark every offense's games that fall on the second night of a back-to-back
# (a one-day gap from that team's previous game in the same season).
gd <- game_data %>%
  mutate(gamedate = as.Date(gamedate), season = as.integer(season)) %>%
  group_by(season, off_team) %>%
  arrange(gamedate, .by_group = TRUE) %>%
  mutate(is_second_b2b = as.integer(gamedate - lag(gamedate) == 1)) %>%
  ungroup()

# 2023-24 games where Brooklyn defended (either abbreviation), with shots.
opp_vs_bkn_2023 <- gd %>%
  filter(season == 2023, def_team %in% c("BKN", "BRK"), fgattempted > 0)

# Defensive eFG% = (FGM + 0.5 * 3PM) / FGA, as a percentage.
bkn_def_efg <- 100 * with(
  opp_vs_bkn_2023,
  sum(fgmade + 0.5 * fg3made, na.rm = TRUE) / sum(fgattempted, na.rm = TRUE)
)

# Same metric, but only when the opponent is on the second night of a B2B.
opp_b2b <- opp_vs_bkn_2023 %>% filter(is_second_b2b == 1)
bkn_def_efg_b2b <- 100 * with(
  opp_b2b,
  sum(fgmade + 0.5 * fg3made, na.rm = TRUE) / sum(fgattempted, na.rm = TRUE)
)
sprintf("BKN Defensive eFG%%: %.1f%%", bkn_def_efg)
## [1] "BKN Defensive eFG%: 54.3%"
sprintf("When opponent on a B2B (second night): %.1f%%", bkn_def_efg_b2b)
## [1] "When opponent on a B2B (second night): 53.5%"
In this section, I use a sequence of visualizations to move from league-wide trends to team-level, season-specific context, showing how schedule structure has evolved and how it plays out in practice.
I start with the first graph, which tracks the share of games played on the second night of a back-to-back by season. I compute this rate for each year and plot it over time to evaluate whether the league has reduced compressed scheduling. The visualization shows a clear downward trend, and I annotate the first and most recent seasons to make the magnitude of that change immediately clear. This establishes that rest-related scheduling has improved meaningfully over the past decade.
I then move to the second graph, which focuses on average inter-city travel distance per game. Using team latitude and longitude data, I calculate the distance between game locations with the Haversine formula and aggregate those distances by season. This visualization shows that, unlike back-to-backs, average travel distance has remained relatively flat over time, with only small fluctuations. Together, the first two graphs show that while the league has been able to reduce schedule density, geographic constraints limit how much travel can be optimized.
Finally, I use the third and final visualization, an interactive, season-level schedule plot, to translate those league-wide trends into something tangible at the team level. This tool displays the full 2024–25 draft schedules for OKC and DEN across the season, highlighting home and road games, rest gaps, back-to-backs, and 4-in-6 stretches. By making the plot interactive, the viewer can quickly identify dense clusters of games, long home stands, extended road trips, and high-stress periods that are not obvious from summary statistics alone.
This final visualization bridges the gap between trends and application. While the first two graphs provide historical and structural context, the interactive schedule plot shows how those constraints actually manifest for a specific team, helping frame practical decisions around preparation, rest, and rotation management.
# Read a CSV quietly, failing fast with a clear message when it is absent.
load_csv <- function(fname) {
  if (!file.exists(fname)) {
    stop("Missing file: ", fname)
  }
  readr::read_csv(fname, show_col_types = FALSE)
}
# Lazily (re)load the inputs, then normalize column naming on both tables.
if (!exists("schedule")) schedule <- load_csv("schedule.csv")
if (!exists("locations")) locations <- load_csv("locations.csv")
names(schedule) <- tolower(names(schedule))
names(locations) <- tolower(names(locations))

# Guarantee a Date-typed `gamedate` column, accepting common aliases.
if ("gamedate" %in% names(schedule)) {
  schedule$gamedate <- as.Date(schedule$gamedate)
} else {
  dc <- intersect(names(schedule), c("game_date", "date"))
  if (length(dc) == 0) stop("schedule must have gamedate/game_date/date")
  schedule$gamedate <- as.Date(schedule[[dc[1]]])
}

# Derive a season column from the calendar year when one is absent.
if (!"season" %in% names(schedule)) {
  schedule$season <- as.integer(format(schedule$gamedate, "%Y"))
}
stopifnot(all(c("team","opponent") %in% names(schedule)))
pick1 = function(nm, choices) { hit <- intersect(nm, choices); if (length(hit)) hit[1] else NA_character_ }
# Resolve the team / latitude / longitude columns from their common
# variants, then standardize the names for downstream joins.
ln <- names(locations)
team_col <- pick1(ln, c("team", "team_name", "name"))
lat_col <- pick1(ln, c("latitude", "lat"))
lon_col <- pick1(ln, c("longitude", "lon", "lng", "long"))
if (any(is.na(c(team_col, lat_col, lon_col)))) {
  stop("locations must include team + latitude + longitude (or clear variants)")
}
locations <- locations %>%
  rename(
    team = all_of(team_col),
    latitude = all_of(lat_col),
    longitude = all_of(lon_col)
  )
# Calendar days between consecutive games for each team (NA for each
# team's first game in the sample; grouping is by team only, so offseason
# gaps appear as large values — harmless for the b2b == 1 flag below).
rest_days <- schedule %>%
  arrange(team, gamedate) %>%
  group_by(team) %>%
  mutate(rest_days = as.numeric(gamedate - dplyr::lag(gamedate))) %>%
  ungroup()

# Per-season share of games on consecutive dates (back-to-backs) plus the
# average gap between games.
b2b_by_season <- rest_days %>%
  mutate(b2b = rest_days == 1) %>%
  group_by(season) %>%
  summarise(
    games = n(),
    avg_rest = mean(rest_days, na.rm = TRUE),
    b2b_rate = mean(b2b, na.rm = TRUE),
    .groups = "drop"
  )
# Great-circle distance in kilometres between (lon1, lat1) and (lon2, lat2),
# vectorized over all four arguments. Haversine formula with the IUGG mean
# Earth radius; sqrt(a) is clamped to 1 to guard against floating-point
# overshoot on near-antipodal pairs.
haversine_km <- function(lon1, lat1, lon2, lat2) {
  earth_radius_km <- 6371.0088
  to_rad <- pi / 180
  dphi <- (lat2 - lat1) * to_rad
  dlambda <- (lon2 - lon1) * to_rad
  a <- sin(dphi / 2)^2 +
    cos(lat1 * to_rad) * cos(lat2 * to_rad) * sin(dlambda / 2)^2
  2 * earth_radius_km * asin(pmin(1, sqrt(a)))
}
# Attach each franchise's home-city coordinates for both sides of every
# game, then compute the city-to-city distance per matchup.
loc_team <- locations %>% select(team, team_lat = latitude, team_lon = longitude)
loc_opp <- locations %>% select(opponent = team, opp_lat = latitude, opp_lon = longitude)
league_travel <- schedule %>%
  left_join(loc_team, by = "team") %>%
  left_join(loc_opp, by = "opponent") %>%
  mutate(dist_km = haversine_km(team_lon, team_lat, opp_lon, opp_lat))

# Season-level average inter-city distance per game.
travel_by_season <- league_travel %>%
  group_by(season) %>%
  summarise(avg_dist_km = mean(dist_km, na.rm = TRUE), .groups = "drop")
# Shared minimal ggplot theme: bold title, muted grey text, and light
# major gridlines with the minor grid removed.
theme_clean <- function(base_size = 13) {
  text_grey <- "grey25"
  theme_minimal(base_size = base_size) +
    theme(
      plot.title = element_text(face = "bold", size = base_size + 2, margin = margin(b = 6)),
      plot.subtitle = element_text(color = "grey40", margin = margin(b = 10)),
      axis.title = element_text(color = text_grey),
      axis.text = element_text(color = text_grey),
      panel.grid.minor = element_blank(),
      panel.grid.major.x = element_line(color = "grey88"),
      panel.grid.major.y = element_line(color = "grey90")
    )
}
# Endpoint rows and the net change, reused in the subtitle and labels.
b2b_first <- slice_min(b2b_by_season, season, n = 1)
b2b_last <- slice_max(b2b_by_season, season, n = 1)
b2b_delta <- b2b_last$b2b_rate - b2b_first$b2b_rate

# Trend in the share of back-to-back games, with the first and most recent
# seasons annotated directly on the line.
p1 <- ggplot(b2b_by_season, aes(season, b2b_rate)) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 2.3) +
  geom_smooth(method = "loess", se = TRUE, linewidth = 0.8, alpha = 0.15) +
  geom_label(
    data = b2b_first,
    aes(label = paste0(season, "\n", scales::percent(b2b_rate, .1))),
    vjust = 1.2, label.size = 0, fill = "grey98"
  ) +
  geom_label(
    data = b2b_last,
    aes(label = paste0(season, "\n", scales::percent(b2b_rate, .1))),
    vjust = -0.2, label.size = 0, fill = "grey98"
  ) +
  scale_x_continuous(breaks = scales::pretty_breaks()) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(
    title = "Share of Back-to-Back Games",
    subtitle = paste0(
      "Change: ", ifelse(b2b_delta >= 0, "+", ""),
      scales::percent(b2b_delta, .1),
      " from ", b2b_first$season, " to ", b2b_last$season
    ),
    x = "Season", y = "B2B share"
  ) +
  theme_clean()
# Endpoint rows and net change for the travel-distance series.
km_first <- slice_min(travel_by_season, season, n = 1)
km_last <- slice_max(travel_by_season, season, n = 1)
km_delta <- km_last$avg_dist_km - km_first$avg_dist_km

# Trend in average inter-city distance per game, with endpoints annotated.
p2 <- ggplot(travel_by_season, aes(season, avg_dist_km)) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 2.3) +
  geom_smooth(method = "loess", se = TRUE, linewidth = 0.8, alpha = 0.15) +
  geom_label(
    data = km_first,
    aes(label = paste0(season, "\n", scales::comma(round(avg_dist_km, 1)), " km")),
    vjust = 1.2, label.size = 0, fill = "grey98"
  ) +
  geom_label(
    data = km_last,
    aes(label = paste0(season, "\n", scales::comma(round(avg_dist_km, 1)), " km")),
    vjust = -0.2, label.size = 0, fill = "grey98"
  ) +
  scale_x_continuous(breaks = scales::pretty_breaks()) +
  labs(
    title = "Average Inter-City Distance per Game",
    subtitle = paste0(
      "Change: ", ifelse(km_delta >= 0, "+", ""),
      scales::comma(round(km_delta, 1)),
      " km from ", km_first$season, " to ", km_last$season
    ),
    x = "Season", y = "Average distance (km)"
  ) +
  theme_clean()
# Lay the two panels side by side when patchwork is installed; otherwise
# fall back to printing them one after the other.
if (requireNamespace("patchwork", quietly = TRUE)) {
  p1 + p2 + patchwork::plot_layout(ncol = 2)
} else {
  print(p1)
  print(p2)
}
# Plain-text summary of the two deltas computed above.
# Note: b2b_delta is a change in a share, so the printed "%" figure is in
# percentage points, not a relative change.
cat("- Back-to-back frequency has ",
ifelse(b2b_delta < 0, "decreased", "increased"), " by ",
scales::percent(abs(b2b_delta), .1), ".\n", sep = "")
## - Back-to-back frequency has decreased by 6.7%.
# km_delta is an absolute change in average kilometres per game.
cat("- Average per-game travel distance has ",
ifelse(km_delta < 0, "decreased", "increased"), " by ",
scales::comma(round(abs(km_delta), 1)), " km.\n", sep = "")
## - Average per-game travel distance has decreased by 4 km.
library(plotly)
# Re-definition of the 4-in-6 flag for the interactive-plot section: TRUE
# for each game that is at least the 4th within the trailing 6-day window
# ending on its date.
#
# FIX (matches the correction earlier in this file): use `>= 4L`, not
# `== 4L`, so a game that is the 5th in a 6-day window is still flagged.
four_in_six <- function(dates) {
  dates <- as.Date(dates)
  vapply(dates, function(d) sum(dates >= (d - 5) & dates <= d) >= 4L, logical(1))
}
# Return the first column of `df` whose name appears in `candidates`;
# error with the attempted names when none are present.
pick_col <- function(df, candidates, label) {
  present <- candidates[candidates %in% names(df)]
  if (length(present) == 0) {
    stop("Missing column for ", label,
         " (tried: ", paste(candidates, collapse = ", "), ")")
  }
  df[[present[1]]]
}
# Normalize the draft schedule: canonical column names, a 0/1 home flag
# tolerant of logical/numeric/character encodings, restricted to OKC/DEN.
draft <- draft_schedule %>%
  transmute(
    gamedate = as.Date(pick_col(., c("gamedate","game_date","date"), "gamedate")),
    team = as.character(pick_col(., c("team","team_abbr"), "team")),
    opponent = as.character(pick_col(., c("opponent","opp","opponent_abbr"), "opponent")),
    home_raw = pick_col(., c("home","is_home","home_flag","home_game","homeAway"), "home flag")
  ) %>%
  mutate(
    # Branch on the column's class once, rather than per-row via case_when.
    home = if (is.character(home_raw)) {
      as.integer(home_raw %in% c("H","Home","HOME","1","TRUE","T"))
    } else if (is.logical(home_raw) || is.numeric(home_raw)) {
      as.integer(home_raw)
    } else {
      NA_integer_
    },
    home_away = if_else(home == 1L, "Home", "Away")
  ) %>%
  filter(team %in% c("OKC","DEN")) %>%
  arrange(team, gamedate)
# Per-game plotting table: a Home/Away lane, rest buckets, 4-in-6 flags,
# and a prebuilt HTML hover string.
seq_df <- draft %>%
  group_by(team) %>%
  arrange(gamedate, .by_group = TRUE) %>%
  mutate(
    lane = if_else(home_away == "Home", 1, 0), # 1 = Home, 0 = Away
    # Calendar-day gap between consecutive games; 1 = consecutive dates.
    rest_days = as.numeric(gamedate - lag(gamedate)),
    # FIX: a 1-day date gap IS a back-to-back (zero days off). The previous
    # mapping put back-to-backs at rest_days == 0, which can never occur
    # (two games on the same date don't happen), so every B2B was
    # mislabeled "1 day". This also matches the diff-of-one B2B definition
    # used elsewhere in this analysis.
    rest_bucket = case_when(
      is.na(rest_days) ~ NA_character_,
      rest_days == 1 ~ "B2B (0d)",
      rest_days == 2 ~ "1 day",
      TRUE ~ "2+ days"
    ),
    in_4in6 = four_in_six(gamedate),
    hover = paste0(
      "<b>", team, "</b> vs <b>", opponent, "</b> ",
      if_else(home_away == "Home","(Home)","(Away)"),
      "<br><b>Date:</b> ", format(gamedate, "%a, %b %d, %Y"),
      # Show days off (date gap minus one) so the hover agrees with the
      # rest buckets above.
      "<br><b>Rest:</b> ", if_else(is.na(rest_days), "—", paste0(rest_days - 1, " day(s)")),
      if_else(in_4in6, "<br><b>In 4-in-6:</b> Yes", ""))) %>%
  ungroup()
# Collapse consecutive same-venue games into runs (homestands/road trips);
# x1 is exclusive (last date + 1). Only runs spanning >= 4 days are kept.
runs <- seq_df %>%
  group_by(team) %>%
  arrange(gamedate, .by_group = TRUE) %>%
  mutate(
    change = home_away != lag(home_away),
    run_id = cumsum(replace_na(change, FALSE))
  ) %>%
  group_by(team, run_id, home_away) %>%
  summarise(x0 = min(gamedate), x1 = max(gamedate) + 1, .groups = "drop") %>%
  mutate(days = as.numeric(x1 - x0)) %>%
  filter(days >= 4)
# Contiguous calendar bands of flagged 4-in-6 games per team; x1 is
# exclusive (last flagged date + 1) so bands render as closed rectangles.
dens_bands <- seq_df %>%
  group_by(team) %>%
  arrange(gamedate, .by_group = TRUE) %>%
  mutate(block = cumsum(in_4in6 != lag(in_4in6, default = FALSE))) %>%
  filter(in_4in6) %>%
  group_by(team, block) %>%
  summarise(x0 = min(gamedate), x1 = max(gamedate) + 1, .groups = "drop_last") %>%
  select(team, x0, x1) %>%
  ungroup()
# For one team and venue side ("Home"/"Away"), locate the run containing
# the most games and return its calendar midpoint plus a printable label.
# Returns NULL when the team has no qualifying runs on that side.
longest_run_label <- function(team_name, side, runs_tbl, games_tbl) {
  side_runs <- runs_tbl %>% dplyr::filter(team == team_name, home_away == side)
  if (nrow(side_runs) == 0) {
    return(NULL)
  }
  # Count the games falling inside each run's [x0, x1) window.
  side_runs <- side_runs %>%
    rowwise() %>%
    mutate(games_in_run = sum(
      games_tbl$team == team_name &
        games_tbl$home_away == side &
        games_tbl$gamedate >= x0 & games_tbl$gamedate < x1)) %>%
    ungroup()
  top <- side_runs %>% slice_max(games_in_run, n = 1, with_ties = FALSE)
  caption <- if (side == "Home") "Longest homestand" else "Longest road trip"
  label_text <- sprintf("%s: %dg (%s–%s)", caption, top$games_in_run,
                        format(top$x0, "%b %d"), format(top$x1 - 1, "%b %d"))
  list(x = top$x0 + (top$x1 - top$x0) / 2, text = label_text)
}
# Build one team's interactive schedule panel: background bands for 4-in-6
# stretches and long home/road runs, one marker per game (lane 0 = Away,
# lane 1 = Home; color = rest bucket, symbol = venue), plus annotations
# for the longest homestand/road trip and a 4-in-6 count badge.
#
# tname   - team abbreviation to plot (filters seq_df/runs/dens_bands).
# showleg - whether this panel contributes the shared legend (only one
#           subplot should, to avoid duplicate legend entries).
# Returns a plotly figure object.
make_team_fig = function(tname, showleg = TRUE) {
d = dplyr::filter(seq_df, team == tname)
rb = dplyr::filter(runs, team == tname)
db = dplyr::filter(dens_bands, team == tname)
fig = plotly::plot_ly()
# 4-in-6 bands: one translucent purple rectangle per density block, drawn
# first so the game markers render on top. Rectangles are closed polygons
# spanning slightly beyond both lanes (y in [-0.4, 1.4]).
if (nrow(db)) {
for (i in seq_len(nrow(db))) {
fig = fig %>% plotly::add_trace(
type = "scatter", mode = "lines",
x = c(db$x0[i], db$x1[i], db$x1[i], db$x0[i]),
y = c(-0.4, -0.4, 1.4, 1.4),
hoverinfo = "skip", fill = "toself",
fillcolor = "rgba(137,99,255,0.15)",
line = list(width = 0), showlegend = FALSE, inherit = FALSE)}}
# long home/road runs: green tint for homestands, red tint for road trips,
# same rectangle construction as the density bands.
if (nrow(rb)) {
for (i in seq_len(nrow(rb))) {
fcol = if (rb$home_away[i] == "Home")
"rgba(122,203,119,0.16)" else "rgba(230,124,115,0.16)"
fig = fig %>% plotly::add_trace(
type = "scatter", mode = "lines",
x = c(rb$x0[i], rb$x1[i], rb$x1[i], rb$x0[i]),
y = c(-0.4, -0.4, 1.4, 1.4),
hoverinfo = "skip", fill = "toself", fillcolor = fcol,
line = list(width = 0), showlegend = FALSE, inherit = FALSE)}
}
# Game markers on top of the bands: y is the Home/Away lane, color encodes
# rest bucket, symbol encodes venue, and the prebuilt HTML hover string is
# used verbatim (<extra></extra> suppresses the trace-name box).
fig = fig %>%
plotly::add_markers(
data = d,
x = ~gamedate, y = ~lane,
color = ~rest_bucket,
colors = c("B2B (0d)"="#D73027","1 day"="#FC8D59","2+ days"="#1A9850"),
symbol = ~home_away, symbols = c(Home="circle", Away="triangle-up"),
marker = list(size = 10, line = list(width = 0.6, color = "white")),
hovertemplate = ~paste0(hover, "<extra></extra>"),
showlegend = showleg
) %>%
plotly::layout(
yaxis = list(title = "", tickvals = c(0,1), ticktext = c("Away","Home"),
range = c(-0.5, 1.5), zeroline = FALSE),
xaxis = list(title = "", tickformat = "%b", dtick = "M1"),
title = list(text = tname, x = 0.01, y = 0.98,
xanchor = "left", yanchor = "top")
)
# Longest-homestand annotation, placed just above the Home lane at the
# run's calendar midpoint (skipped when no qualifying run exists).
lab_home = longest_run_label(tname, "Home", runs, seq_df)
if (!is.null(lab_home)) {
fig = fig %>% plotly::add_annotations(
x = lab_home$x, y = 1.32, xref = "x", yref = "y",
text = lab_home$text, showarrow = FALSE,
bgcolor = "rgba(122,203,119,0.20)", bordercolor = "#7ACB77",
borderwidth = 1, font = list(size = 11, color = "#1b4d1b"),
align = "center"
)
}
# Longest-road-trip annotation, mirrored just below the Away lane.
lab_away = longest_run_label(tname, "Away", runs, seq_df)
if (!is.null(lab_away)) {
fig = fig %>% plotly::add_annotations(
x = lab_away$x, y = -0.32, xref = "x", yref = "y",
text = lab_away$text, showarrow = FALSE,
bgcolor = "rgba(230,124,115,0.20)", bordercolor = "#E67C73",
borderwidth = 1, font = list(size = 11, color = "#6a1a14"),
align = "center"
)
}
# 4-in-6 badge: total flagged games for this team, pinned to the top-right
# corner in paper coordinates so it stays put when zooming.
n_4in6 = sum(d$in_4in6, na.rm = TRUE)
fig = fig %>% plotly::add_annotations(
xref = "paper", yref = "paper", x = 0.98, y = 1.12,
text = paste0("4-in-6: <b>", n_4in6, "</b>"),
showarrow = FALSE, align = "right",
bgcolor = "rgba(233,224,255,0.7)", bordercolor = "rgba(137,99,255,0.6)",
borderwidth = 1, font = list(size = 11)
)
fig
}
# Assemble the two team panels into one interactive figure. Only the DEN
# panel contributes the legend (showleg = TRUE) so entries are not doubled.
fig_den = make_team_fig("DEN", showleg = TRUE)
fig_okc = make_team_fig("OKC", showleg = FALSE)
# Side-by-side subplots sharing the y-axis; overall sizing, horizontal
# legend below the plot, and a figure-level title.
fig = subplot(fig_okc, fig_den, nrows = 1, shareY = TRUE, titleX = TRUE) %>%
layout(
height = 650,
margin = list(l = 80, r = 20, t = 70, b = 80),
legend = list(orientation = "h", x = 0, y = -0.15),
title = list(text = "Interactive 2024–25 Schedules - OKC & DEN",
x = 0.02, xanchor = "left"),
hoverlabel = list(bgcolor = "white")
) %>%
# Second layout() call: month tick labels and a range slider on each
# subplot's x-axis (xaxis = left/OKC panel, xaxis2 = right/DEN panel).
layout(
xaxis = list(tickformat = "%b", rangeslider = list(visible = TRUE)),
xaxis2 = list(tickformat = "%b", rangeslider = list(visible = TRUE))
)
# Panel headers in paper coordinates, centered over each subplot.
fig = fig %>%
add_annotations(text = "<b>OKC</b>", x = 0.25, y = 1.10,
xref = "paper", yref = "paper",
showarrow = FALSE, font = list(size = 16)) %>%
add_annotations(text = "<b>DEN</b>", x = 0.75, y = 1.10,
xref = "paper", yref = "paper",
showarrow = FALSE, font = list(size = 16))
# Unified hover along the date axis; trim the modebar of selection tools
# and hide the plotly logo.
fig = fig %>%
layout(hovermode = "x unified") %>%
config(displaylogo = FALSE,
modeBarButtonsToRemove = c("select2d","lasso2d"))
# Symbol key below the figure, since marker shapes carry the venue encoding.
fig = fig %>%
add_annotations(
xref="paper", yref="paper", x=0.5, y=-0.12, showarrow=FALSE,
text="● Home ▲ Away", font=list(size=12, color="rgba(0,0,0,0.65)")
)
fig
.
.
.
In this section, I build a simple model to estimate how much a team’s schedule has helped or hurt its total regular-season wins due to schedule-related factors from the 2019–20 through 2023–24 seasons. The goal is not to predict wins, but to isolate the portion of win totals that can reasonably be attributed to schedule structure, rather than team quality.
I start by standardizing the schedule data across seasons, ensuring consistent game dates, team identifiers, and home/away flags. This allows me to reliably track how many games each team plays at home versus on the road over the full multi-season window. I restrict the analysis to games from 2019–20 onward to focus on a modern scheduling context.
To translate schedule structure into wins, I estimate a league-average home-court advantage directly from the data. I calculate the difference between home win percentage and away win percentage across all games in the sample, and cap this estimate within a reasonable range to avoid extreme values driven by noise. This produces a conservative, data-driven estimate of how much playing at home is worth in terms of win probability.
Using this estimate, I then compute each team’s home-game imbalance — the amount by which its share of home games deviates from a perfectly balanced 50/50 split. I multiply that imbalance by the total number of games played and by the estimated home-court edge to convert schedule structure into an estimated number of wins gained or lost purely due to scheduling.
I aggregate these effects at the team level across all seasons in the sample, producing a single number per team that represents how much the schedule has helped or hurt them overall. I then visualize these estimates in a bar chart centered at zero, making it easy to compare teams that benefited from schedule imbalance to those that were disadvantaged.
Finally, I identify the teams most helped and most hurt by schedule effects. The results show that while the magnitude of these effects is modest, they are still meaningful at the margins, especially in a league where playoff seeding and tiebreakers often come down to one or two games.
Overall, this modeling approach provides a rough but interpretable estimate of schedule-driven win impact. It intentionally prioritizes transparency and realism over complexity, and the results should be interpreted as directional rather than exact, capturing how schedule structure alone can influence outcomes over time.
# Ensure a Date-typed `gamedate`, resolving whichever alias the schedule
# carries (gamedate > game_date > date, in that order of preference).
schedule <- schedule %>%
  mutate(gamedate = as.Date(
    if ("gamedate" %in% names(schedule)) gamedate
    else if ("game_date" %in% names(schedule)) game_date
    else date
  ))
# Derive a per-game logical `home` flag for every (team, gamedate) row,
# accepting several possible encodings of venue in the schedule:
#   1. an explicit `home` column (logical, 0/1, or yes/no-style strings),
#   2. a `home_team` column compared against `team`,
#   3. a `homeaway` or `location` column holding "home"/"h",
#   4. otherwise NA (venue unknown).
# NOTE(review): this braced block evaluates in the global environment, so
# `s` and `has` become globals as a side effect — confirm that is intended.
home_away = {
s = schedule
has = function(x) x %in% names(s)
if (has("home")) {
s = s %>%
mutate(home = case_when(
is.logical(home) ~ home,
# String/numeric encodings are lowered to text before matching;
# anything not recognizably home or away becomes NA.
tolower(as.character(home)) %in% c("1","t","true","yes","y","h","home") ~ TRUE,
tolower(as.character(home)) %in% c("0","f","false","no","n","a","away") ~ FALSE,
TRUE ~ NA
))
} else if (has("home_team") && has("team")) {
s = s %>% mutate(home = team == .data$home_team)
} else if (has("homeaway")) {
s = s %>% mutate(home = tolower(as.character(homeaway)) %in% c("home","h"))
} else if (has("location")) {
s = s %>% mutate(home = tolower(as.character(location)) %in% c("home","h"))
} else {
s = s %>% mutate(home = NA)
}
# Return only the key columns plus the normalized logical flag.
s %>% transmute(team, gamedate, home = as.logical(home))
}
# Build a minimal per-game feature table only if an upstream one is not
# already in the session; rest/distance features are left NA here.
if (!exists("gd_feat")) {
  gd_feat <- schedule %>%
    select(team, opponent, gamedate) %>%
    left_join(home_away, by = c("team", "gamedate")) %>%
    mutate(rest_days = NA_real_, dist_km = NA_real_)
}
# Restrict the home/away table to calendar years 2019-2024.
ha_19_24 <- home_away %>%
  filter(year(gamedate) >= 2019, year(gamedate) <= 2024)

# Locate a per-game win indicator, preferring gd_feat over the schedule;
# win_df stays NULL when neither source has one.
win_df <- NULL
if (exists("gd_feat") && all(c("team", "gamedate", "win") %in% names(gd_feat))) {
  win_df <- gd_feat %>%
    transmute(team, gamedate = as.Date(gamedate), win = as.numeric(as.logical(win)))
} else if ("win" %in% names(schedule)) {
  win_df <- schedule %>%
    transmute(team, gamedate = as.Date(if ("gamedate" %in% names(schedule)) gamedate else
      if ("game_date" %in% names(schedule)) game_date else date), win = as.numeric(as.logical(win)))
}

# FIX: the original joined unconditionally; left_join(x, NULL) errors, so
# a dataset with no win column crashed here instead of reaching
# estimate_edge()'s documented no-win fallback. Skip the join when there
# is nothing to join.
ha_out <- if (is.null(win_df)) {
  ha_19_24
} else {
  ha_19_24 %>% left_join(win_df, by = c("team", "gamedate"))
}
# Estimate the league-average home-court edge (home win% minus away win%)
# from game-level data, clamped to a plausible range.
#
# @param df        Data frame; uses logical `home` and 0/1 `win` columns
#                  when present.
# @param fallback  Edge returned when wins are missing or the sample is
#                  too small/degenerate (previously a thrice-repeated
#                  magic number).
# @param min_games Minimum usable rows before trusting the data.
# @param lower,upper Clamp bounds guarding against noise-driven extremes.
# @return A single numeric win-probability edge.
estimate_edge <- function(df, fallback = 0.14, min_games = 100,
                          lower = 0.08, upper = 0.20) {
  if (!"win" %in% names(df) || all(is.na(df$win))) return(fallback)
  # Base subsetting instead of dplyr::filter — no package dependency here.
  usable <- df[!is.na(df$home) & !is.na(df$win), , drop = FALSE]
  if (nrow(usable) < min_games) return(fallback)
  home_wp <- mean(usable$win[usable$home], na.rm = TRUE)
  away_wp <- mean(usable$win[!usable$home], na.rm = TRUE)
  edge <- home_wp - away_wp
  # All-home or all-away samples yield NaN means; fall back rather than
  # propagating a non-finite edge into the win estimates.
  if (!is.finite(edge)) return(fallback)
  pmin(pmax(edge, lower), upper)
}
home_edge <- estimate_edge(ha_out)

# Convert each team's home-share imbalance into an estimated win impact:
# (home_share - 0.5) * games * per-game home edge, sorted best to worst.
team_totals <- ha_19_24 %>%
  group_by(team) %>%
  summarise(
    games = n(),
    home_share = mean(home, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(schedule_wins = (home_share - 0.5) * games * home_edge) %>%
  arrange(desc(schedule_wins))
# The single most-helped and most-hurt teams, then a zero-centered bar
# chart of estimated schedule-driven win impact across the league.
most_helped <- slice_max(team_totals, schedule_wins, n = 1)
most_hurt <- slice_min(team_totals, schedule_wins, n = 1)

team_totals %>%
  mutate(team = forcats::fct_reorder(team, schedule_wins)) %>%
  ggplot(aes(team, schedule_wins)) +
  geom_col() +
  geom_hline(yintercept = 0, linetype = "dashed", linewidth = 0.6) +
  coord_flip() +
  scale_y_continuous(labels = function(x) sprintf("%+.1f", x)) +
  labs(
    title = "Estimated Wins Gained/Lost from Home-Share Imbalance (2019–2024)",
    x = NULL, y = "Wins due to home-share imbalance"
  ) +
  theme_minimal(base_size = 12)